{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "#top 5k dice keywords\n",
    "KEY_WORDS_FILE = \"/Users/simon.hughes/Documents/Dice Data/LuceneTalk/top_5k_keywords.txt\"\n",
    "TOPN           = 30\n",
    "PAYLOAD_SYNONYMS_FILE  = \"/Users/simon.hughes/Documents/Dice Data/LuceneTalk/top%i_keyword_synonyms.txt\" % TOPN\n",
    "SYNONYMS_FILE  = \"/Users/simon.hughes/Documents/Dice Data/LuceneTalk/keywords.txt\"\n",
    "PHRASES_FILE = \"/Users/simon.hughes/Documents/Dice Data/LuceneTalk/Phrases.txt\"\n",
    "MODEL_FILE     = \"/Users/simon.hughes/Documents/Dice Data/LuceneTalk/keyword_model.w2v\""
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "#Shared\n",
    "#just used to load phrases file\n",
    "def load_stop_words(stop_words_file):\n",
    "    stop_words = set()\n",
    "    with open(stop_words_file) as f:\n",
    "            for line in f:\n",
    "                word = line.strip()\n",
    "                if word[0] != \"#\":\n",
    "                    word = word.lower()\n",
    "                    stop_words.add(word)\n",
    "    return stop_words"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "#functions\n",
    "def map_keyword(kw):\n",
    "    return kw.replace(\" \", \"_\")\n",
    "\n",
    "def write_most_similar_synonyms(topn, key_words, model, expand_fname, map_fname):\n",
    "    key_words = set(key_words)\n",
    "    missing = set()\n",
    "    no_sim = set()\n",
    "    all_syns = set()\n",
    "    with open(expand_fname, \"w+\") as exp_f:\n",
    "        for word in key_words:\n",
    "            if not word in model.vocab:\n",
    "                missing.add(word)\n",
    "                continue\n",
    "            \n",
    "            top_matches = model.most_similar(positive=word, topn=topn*10)\n",
    "            valid = []\n",
    "            for t,sim in top_matches:\n",
    "                if t in key_words and sim > 0.01:\n",
    "                    valid.append((t,sim))\n",
    "                    if len(valid) >= topn:\n",
    "                        break\n",
    "                \n",
    "            if len(valid) > 0:\n",
    "                all_syns.add(word)\n",
    "                exp_f.write(\"%s=>\" % word)\n",
    "                for key, val in valid:\n",
    "                    all_syns.add(key)\n",
    "                    kw = map_keyword(key)                        \n",
    "                    exp_f.write(\"%s|%f \" %(kw,val))\n",
    "                exp_f.write(\"\\n\")\n",
    "            else:\n",
    "                no_sim.add(word)\n",
    "                #print(\"No matching similar terms in word2vec model for term: %s\" % word)\n",
    "    with open(map_fname, \"w+\") as f:\n",
    "        for syn in sorted(all_syns):\n",
    "            f.write(\"%s=>%s\\n\" % (syn, map_keyword(syn)))\n",
    "    return all_syns, missing, no_sim"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "import gensim, time\n",
    "from gensim.models.word2vec import Word2Vec\n",
    "\n",
    "model = Word2Vec.load(MODEL_FILE)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "21278"
      ]
     },
     "execution_count": 5,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "keywords = load_stop_words(PHRASES_FILE)\n",
    "len(keywords)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "24114 keywords loaded\n"
     ]
    }
   ],
   "source": [
    "with open(KEY_WORDS_FILE) as f:\n",
    "    for line in f:\n",
    "        kw = line.strip()\n",
    "        if len(kw) > 0:\n",
    "            keywords.add(kw)\n",
    "print(\"%i keywords loaded\" % (len(keywords)))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "all_syns, missing, no_sim = write_most_similar_synonyms(TOPN, keywords, model, PAYLOAD_SYNONYMS_FILE, SYNONYMS_FILE)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "4962 26 24114\n"
     ]
    }
   ],
   "source": [
    "print len(missing), len(no_sim), len(keywords)"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 2",
   "language": "python",
   "name": "python2"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 2
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython2",
   "version": "2.7.10"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 0
}
